/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.db;
import java.io.*;
import java.util.*;
import java.util.logging.*;
import java.nio.channels.*;
import net.nutch.io.*;
import net.nutch.util.*;
import net.nutch.pagedb.*;
import net.nutch.linkdb.*;
/***************************************************
* This is a wrapper class that allows us to reorder
* write operations to the linkdb and pagedb. It is
* useful only for objects like UpdateDatabaseTool,
* which just do writes.
*
* The DistributedWebDBWriter applies edits across a pool
* of machines. Each machine buffers its instructions in
* per-table edit sections; at close() time every machine's
* sections are merged, sorted, and applied to a brand-new
* copy of each of the four db tables.
*
* WebDBWriter is the traditional single-pass, single-machine
* implementor of IWebDBWriter.
*
* @author Mike Cafarella
*************************************************/
public class DistributedWebDBWriter implements IWebDBWriter {
static final Logger LOG = LogFormatter.getLogger("net.nutch.db.DistributedWebDBWriter");
static final byte CUR_VERSION = 0;
static final byte OPEN_COUNTER_VERSION = 0;
static final byte CLOSE_COUNTER_VERSION = 0;
static final byte MACHINE_INFO_VERSION = 0;
// magic numbers
static final int READY_TO_USE = 0xbabecafe;
static final int IS_COMPLETE = 0xbabe0000;
static final int WRITE_LOCK_INFO = 0xcafe0000;
static final long LONG_TIMEOUT = 10 * 1000;
// db opcodes
static final byte ADD_PAGE = 0;
static final byte ADD_PAGE_WITH_SCORE = 1;
static final byte ADD_PAGE_IFN_PRESENT = 2;
static final byte DEL_PAGE = 3;
static final int ADD_LINK = 0;
static final int DEL_LINK = 1;
static final int DEL_SINGLE_LINK = 2;
// filenames
static final String PAGES_BY_URL = "pagesByURL";
static final String PAGES_BY_MD5 = "pagesByMD5";
static final String LINKS_BY_URL = "linksByURL";
static final String LINKS_BY_MD5 = "linksByMD5";
static final String STATS_FILE = "stats";
static final String META_SHAREGROUP = "metashare";
static final String METAINFO = "metainfo";
// Result codes for link-target checks (see TargetTester.hasOutlinks)
static final int NO_OUTLINKS = 0;
static final int HAS_OUTLINKS = 1;
static final int LINK_INVALID = 2;
/********************************************
* PageInstruction holds an operation over a Page.
*********************************************/
public static class PageInstruction implements WritableComparable {
byte opcode;
boolean hasLink;
Page page;
Link link;
/**
* Default constructor, used during Writable deserialization.
*/
public PageInstruction() {}
/**
* Construct from a Page and an opcode, with no Link.
*/
public PageInstruction(Page page, int opcode) {
set(page, opcode);
}
/**
* Construct from a Page, a Link, and an opcode.
*/
public PageInstruction(Page page, Link link, int opcode) {
set(page, link, opcode);
}
/**
* Init from another PageInstruction object.
*/
public void set(PageInstruction that) {
this.opcode = that.opcode;
if (this.page == null) {
this.page = new Page();
}
this.page.set(that.page);
if (this.link == null) {
this.link = new Link();
}
this.hasLink = that.hasLink;
if (this.hasLink) {
this.link.set(that.link);
}
}
/**
* Init PageInstruction with no Link
*/
public void set(Page page, int opcode) {
this.opcode = (byte) opcode;
this.page = page;
this.hasLink = false;
this.link = null;
}
/**
* Init PageInstruction with a Link
*/
public void set(Page page, Link link, int opcode) {
this.opcode = (byte) opcode;
this.page = page;
this.hasLink = true;
this.link = link;
}
//
// WritableComparable
//
public int compareTo(Object o) {
int pageResult = this.page.compareTo(((PageInstruction) o).page);
if (pageResult != 0) {
return pageResult;
} else {
return this.opcode - (((PageInstruction) o).opcode);
}
}
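//
// On-disk record layout, mirrored by write() and readFields() below:
//   1 byte   opcode
//   Page     page, in Page.write() format
//   1 byte   hasLink flag (1 or 0)
//   Link     link, present only when hasLink == 1
//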
public void write(DataOutput out) throws IOException {
out.writeByte(opcode);
page.write(out);
out.writeByte(hasLink ? 1 : 0);
if (hasLink) {
link.write(out);
}
}
public void readFields(DataInput in) throws IOException {
opcode = in.readByte();
if (page == null) {
page = new Page();
}
page.readFields(in);
if (link == null) {
link = new Link();
}
hasLink = (1 == in.readByte());
if (hasLink) {
link.readFields(in);
}
}
public Page getPage() {
return page;
}
public Link getLink() {
if (hasLink) {
return link;
} else {
return null;
}
}
public int getInstruction() {
return opcode;
}
/**
* Sorts the instruction first by Page, then by opcode.
*/
public static class PageComparator extends WritableComparator {
private static final Page.Comparator PAGE_COMPARATOR =
new Page.Comparator();
public PageComparator() { super(PageInstruction.class); }
/** Optimized comparator. */
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
int opcode1 = b1[s1];
int opcode2 = b2[s2];
int c = PAGE_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1);
if (c != 0)
return c;
return opcode1 - opcode2;
}
}
/*****************************************************
* Sorts the instruction first by url, then by opcode.
*****************************************************/
public static class UrlComparator extends WritableComparator {
private static final Page.UrlComparator PAGE_COMPARATOR =
new Page.UrlComparator();
public UrlComparator() { super(PageInstruction.class); }
/**
* We need URL-ordered instructions: sort first by
* URL, then by opcode.
*/
public int compare(WritableComparable a, WritableComparable b) {
PageInstruction instructionA = (PageInstruction)a;
PageInstruction instructionB = (PageInstruction)b;
Page pageA = instructionA.getPage();
Page pageB = instructionB.getPage();
int result = pageA.getURL().compareTo(pageB.getURL());
if (result != 0) {
return result;
} else {
return instructionA.opcode - instructionB.opcode;
}
}
/**
* Optimized comparator.
*/
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
int opcode1 = b1[s1];
int opcode2 = b2[s2];
int c = PAGE_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1);
if (c != 0)
return c;
return opcode1 - opcode2;
}
}
}
/********************************************************
* PageInstructionWriter very efficiently writes a
* PageInstruction to an EditSectionGroupWriter. It reuses a
* single PageInstruction across appends, which is much better
* than calling "writer.append(new PageInstruction())".
********************************************************/
public static class PageInstructionWriter {
PageInstruction pi = new PageInstruction();
/**
* Build a writer with a single reusable PageInstruction.
*/
public PageInstructionWriter() {
}
/**
* Append the PageInstruction info to the indicated
* EditSectionGroupWriter, and keep the PI for later reuse.
*/
public synchronized void appendInstructionInfo(EditSectionGroupWriter writer, Page page, int opcode, Writable val) throws IOException {
pi.set(page, opcode);
writer.append(pi, val);
}
/**
* Append the PageInstruction info to the indicated
* EditSectionGroupWriter, and keep the PI for later reuse.
*/
public synchronized void appendInstructionInfo(EditSectionGroupWriter writer, Page page, Link link, int opcode, Writable val) throws IOException {
pi.set(page, link, opcode);
writer.append(pi, val);
}
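// Usage sketch (illustrative): the enclosing writer calls, e.g.,
//   piwriter.appendInstructionInfo(pagesByURLWriter, page,
//       ADD_PAGE, NullWritable.get());
// reusing the single "pi" instance above instead of allocating a
// new PageInstruction per append.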
}
/*************************************************************
* Reduce multiple instructions for a given url to the single effective
* instruction. ADD is prioritized highest, then ADD_IFN_PRESENT, and then
* DEL. Not coincidentally, that is exactly the order they sort in, so the
* first instruction seen for a url is the effective one.
**************************************************************/
private static class DeduplicatingPageSequenceReader {
SequenceFile.Reader edits;
PageInstruction current = new PageInstruction();
UTF8 currentUrl = new UTF8();
boolean haveCurrent;
/**
* Wrap a sorted edit stream and read ahead one instruction.
*/
public DeduplicatingPageSequenceReader(SequenceFile.Reader edits) throws IOException {
this.edits = edits;
this.haveCurrent = edits.next(current, NullWritable.get());
}
/**
* Fill "result" with the effective instruction for the next
* URL, skipping the rest of that URL's run. Returns false
* when the edits are exhausted.
*/
public boolean next(PageInstruction result) throws IOException {
if (!haveCurrent) {
return false;
}
currentUrl.set(current.getPage().getURL());
result.set(current); // take the first instruction
do {
// skip the rest
} while ((haveCurrent = edits.next(current, NullWritable.get())) &&
currentUrl.compareTo(current.getPage().getURL()) == 0);
return true;
}
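// Example (illustrative): given sorted edits for one URL u,
//   (u, ADD_PAGE) (u, ADD_PAGE_IFN_PRESENT) (u, DEL_PAGE)
// next() returns only the first, (u, ADD_PAGE); the rest of the
// run for u is skipped.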
}
/*************************************************
* Holds an instruction over a Link.
*************************************************/
public static class LinkInstruction implements WritableComparable {
Link link;
int instruction;
/**
* Default constructor, used during Writable deserialization.
*/
public LinkInstruction() {
}
/**
* Construct from a Link and an instruction code.
*/
public LinkInstruction(Link link, int instruction) {
set(link, instruction);
}
/**
* Re-init from another LinkInstruction's info.
*/
public void set(LinkInstruction that) {
this.instruction = that.instruction;
if (this.link == null)
this.link = new Link();
this.link.set(that.link);
}
/**
* Re-init with a Link and an instruction
*/
public void set(Link link, int instruction) {
this.link = link;
this.instruction = instruction;
}
//
// WritableComparable
//
public int compareTo(Object o) {
return this.link.compareTo(((LinkInstruction) o).link);
}
public void write(DataOutput out) throws IOException {
out.writeByte(instruction);
link.write(out);
}
public void readFields(DataInput in) throws IOException {
this.instruction = in.readByte();
if (link == null)
link = new Link();
link.readFields(in);
}
public Link getLink() {
return link;
}
public int getInstruction() {
return instruction;
}
/*******************************************************
* Sorts instructions by the Link's MD5; the opcode byte is skipped, not compared.
*******************************************************/
public static class MD5Comparator extends WritableComparator {
private static final Link.MD5Comparator MD5_COMPARATOR =
new Link.MD5Comparator();
public MD5Comparator() { super(LinkInstruction.class); }
public int compare(WritableComparable a, WritableComparable b) {
LinkInstruction instructionA = (LinkInstruction)a;
LinkInstruction instructionB = (LinkInstruction)b;
return instructionA.link.md5Compare(instructionB.link);
}
/** Optimized comparator. */
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
return MD5_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1);
}
}
/*********************************************************
* Sorts instructions by the Link's URL; the opcode byte is skipped, not compared.
*********************************************************/
public static class UrlComparator extends WritableComparator {
private static final Link.UrlComparator URL_COMPARATOR =
new Link.UrlComparator();
public UrlComparator() { super(LinkInstruction.class); }
public int compare(WritableComparable a, WritableComparable b) {
LinkInstruction instructionA = (LinkInstruction)a;
LinkInstruction instructionB = (LinkInstruction)b;
return instructionA.link.urlCompare(instructionB.link);
}
/**
* Optimized comparator.
*/
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
return URL_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1);
}
}
}
/*******************************************************
* LinkInstructionWriter very efficiently writes a
* LinkInstruction to an EditSectionGroupWriter. It reuses a
* single LinkInstruction across appends, which is much better
* than calling "writer.append(new LinkInstruction())".
********************************************************/
public static class LinkInstructionWriter {
LinkInstruction li = new LinkInstruction();
/**
* Build a writer with a single reusable LinkInstruction.
*/
public LinkInstructionWriter() {
}
/**
* Append the LinkInstruction info to the indicated
* EditSectionGroupWriter and keep the LI for later reuse.
*/
public synchronized void appendInstructionInfo(EditSectionGroupWriter writer, Link link, int opcode, Writable val) throws IOException {
li.set(link, opcode);
writer.append(li, val);
}
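// Usage sketch (illustrative): the enclosing writer calls, e.g.,
//   liwriter.appendInstructionInfo(linksByMD5Writer, link,
//       ADD_LINK, NullWritable.get());
// reusing the single "li" instance above.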
}
/********************************************************
* This class deduplicates link operations. The edits arrive
* sorted by MD5, then by URL; each run of instructions on the
* same Link is collapsed to a single one.
*********************************************************/
class DeduplicatingLinkSequenceReader {
Link currentKey = new Link();
LinkInstruction current = new LinkInstruction();
SequenceFile.Reader edits;
boolean haveCurrent;
/**
* Wrap a sorted edit stream and read ahead one instruction.
*/
public DeduplicatingLinkSequenceReader(SequenceFile.Reader edits) throws IOException {
this.edits = edits;
this.haveCurrent = edits.next(current, NullWritable.get());
}
/**
* The incoming stream of edits is sorted first by MD5, then by URL.
* Links that carry only an MD5 always sort before full MD5+URL links.
*/
public boolean next(LinkInstruction key) throws IOException {
if (! haveCurrent) {
return false;
}
currentKey.set(current.getLink());
do {
key.set(current);
} while ((haveCurrent = edits.next(current, NullWritable.get())) &&
currentKey.compareTo(current.getLink()) == 0);
return true;
}
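// Example (illustrative): given sorted edits for one Link l,
//   (l, ADD_LINK) (l, ADD_LINK) (l, DEL_LINK)
// next() keeps overwriting "key" as it consumes the run, so only
// the last instruction, (l, DEL_LINK), survives.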
}
/**************************************************
* The CloseProcessor class is used when we close down
* the webdb. We give it the path, members, and class values
* needed to apply changes to any of our 4 data tables.
*
* This is an abstract class. Each subclass must define
* the exact merge procedure. However, file-handling
* and edit-processing are standardized as much as possible.
*
**************************************************/
private abstract class CloseProcessor {
String basename;
String curDBPart;
MapFile.Reader oldDb;
EditSectionGroupWriter editWriter;
SequenceFile.Sorter sorter;
WritableComparator comparator;
Class keyClass, valueClass;
long itemsWritten = 0;
/**
* Store away these members for later use.
*/
CloseProcessor(String basename, MapFile.Reader oldDb, EditSectionGroupWriter editWriter, SequenceFile.Sorter sorter, WritableComparator comparator, Class keyClass, Class valueClass, String curDBPart) {
this.basename = basename;
this.oldDb = oldDb;
this.editWriter = editWriter;
this.sorter = sorter;
this.comparator = comparator;
this.keyClass = keyClass;
this.valueClass = valueClass;
this.curDBPart = curDBPart;
}
/**
* Perform the shutdown sequence for this Processor.
* There is a lot of file-moving and edit-sorting that
* is common across all 4 tables.
*
* Returns how many items were written out by this close().
*/
long closeDown(NutchFile workingDir, NutchFile outputDir) throws IOException {
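//
// Overall sequence, common to all four tables:
//   1. close this section's edit writer
//   2. merge every machine's edit sections and sort them
//   3. mergeEdits() the sorted stream against the old db into
//      a brand-new file (subclass-specific)
//   4. publish the new db file and delete the consumed edits
//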
//
// Done adding edits, so close edit-writer.
//
editWriter.close();
//
// Where the output is going
//
NutchFile sectionDir = new NutchFile(outputDir, "dbsection." + machineNum);
NutchFile newDbNF = new NutchFile(sectionDir, basename);
//
// Grab all the edits that we need to process. We build an EditSectionGroupReader
// and aim it at the right location. The ESR will wait until all its
// component Sections are written and completed before returning from
// any method (other than the constructor). So we expect to possibly wait
// inside the call to numEdits().
//
EditSectionGroupReader edits = new EditSectionGroupReader(nutchfs, dbName, basename, machineNum, totalMachines);
int numEdits = edits.numEdits();
// If there are edits, then process them.
if (numEdits != 0) {
File mergedEditsFile = edits.mergeSectionComponents();
File sortedEditsFile = new File(mergedEditsFile.getPath() + ".sorted");
// Sort the edits
long startSort = System.currentTimeMillis();
sorter.sort(mergedEditsFile.getPath(), sortedEditsFile.getPath());
long endSort = System.currentTimeMillis();
LOG.info("Processing " + basename + ": Sorted " + numEdits + " instructions in " + ((endSort - startSort) / 1000.0) + " seconds.");
LOG.info("Processing " + basename + ": Sorted " + (numEdits / ((endSort - startSort) / 1000.0)) + " instructions/second");
// Delete old file
mergedEditsFile.delete();
// Read the sorted edits. That means reading all
// the edits for the local subsection of the
// database; every machine's contribution to the
// edit-list has already been merged (which also
// meant waiting until each machine completed
// that step).
SequenceFile.Reader sortedEdits = new SequenceFile.Reader(sortedEditsFile.getPath());
// Create a brand-new output db for the integrated data
File newDbFile = nutchfs.getWorkingFile();
MapFile.Writer newDb = (comparator == null) ? new MapFile.Writer(newDbFile.getPath(), keyClass, valueClass) : new MapFile.Writer(newDbFile.getPath(), comparator, valueClass);
// Iterate through the edits, and merge changes with existing
// db into the brand-new file
oldDb.reset();
// Merge the edits into the new db.
long startMerge = System.currentTimeMillis();
mergeEdits(oldDb, sortedEdits, newDb);
long endMerge = System.currentTimeMillis();
LOG.info("Processing " + basename + ": Merged to new DB containing " + itemsWritten + " records in " + ((endMerge - startMerge) / 1000.0) + " seconds");
LOG.info("Processing " + basename + ": Merged " + (itemsWritten / ((endMerge - startMerge) / 1000.0)) + " records/second");
// Close down readers, writers
sortedEdits.close();
newDb.close();
// Delete the (sorted) merged-edits
sortedEditsFile.delete();
// Store the newly-written db file
nutchfs.put(newDbNF, newDbFile, true);
} else {
// Otherwise, simply copy the original file into place,
// without all the processing overhead.
long startCopy = System.currentTimeMillis();
NutchFile srcSectionDir = new NutchFile(dbDir, "dbsection." + machineNum);
NutchFile srcDbNF = new NutchFile(srcSectionDir, basename);
File srcDbFile = nutchfs.get(srcDbNF);
nutchfs.put(newDbNF, srcDbFile, true);
long endCopy = System.currentTimeMillis();
LOG.info("Processing " + basename + ": Copied file (" + srcDbFile.length()+ " bytes) in " + ((endCopy - startCopy) / 1000.0) + " secs.");
}
// Delete the now-consumed edits file to save space
edits.delete();
return itemsWritten;
}
/**
* The loop that actually applies the changes and writes to
* a new db. This is different for every subclass!
*/
abstract void mergeEdits(MapFile.Reader db, SequenceFile.Reader edits, MapFile.Writer newDb) throws IOException;
}
/***
* The PagesByURLProcessor is used during close() time for
* the pagesByURL table. We instantiate one of these, and it
* takes care of the entire shutdown process.
*/
private class PagesByURLProcessor extends CloseProcessor {
EditSectionGroupWriter futureEdits;
/**
* We store "futureEdits" so we can emit edits for the
* next table's processing step (pagesByMD5).
*/
PagesByURLProcessor(MapFile.Reader db, EditSectionGroupWriter editWriter, EditSectionGroupWriter futureEdits) {
super(PAGES_BY_URL, db, editWriter, new SequenceFile.Sorter(new PageInstruction.UrlComparator(), NullWritable.class), new UTF8.Comparator(), null, Page.class, "PagesByURLPart");
this.futureEdits = futureEdits;
}
/**
* Merge the existing db with the edit-stream into a brand-new file.
*/
void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException {
// Create the keys and vals we'll be using
DeduplicatingPageSequenceReader edits = new DeduplicatingPageSequenceReader(sortedEdits);
WritableComparable readerKey = new UTF8();
Page readerVal = new Page();
PageInstruction editItem = new PageInstruction();
// Read the first items from both streams
boolean hasEntries = db.next(readerKey, readerVal);
boolean hasEdits = edits.next(editItem);
// As long as we have both edits and entries, we need to
// interleave them....
while (hasEntries && hasEdits) {
int comparison = readerKey.compareTo(editItem.getPage().getURL());
int curInstruction = editItem.getInstruction();
// Perform operations
if ((curInstruction == ADD_PAGE) ||
(curInstruction == ADD_PAGE_WITH_SCORE) ||
(curInstruction == ADD_PAGE_IFN_PRESENT)) {
if (comparison < 0) {
// Write readerKey, just passing it along.
// Don't process the edit yet.
newDb.append(readerKey, readerVal);
itemsWritten++;
hasEntries = db.next(readerKey, readerVal);
} else if (comparison == 0) {
// The keys are equal. If the instruction
// is ADD_PAGE or ADD_PAGE_WITH_SCORE, we write
// the edit's Page and replace the old one.
//
// Otherwise, if it's ADD_PAGE_IFN_PRESENT,
// keep the reader's item intact.
//
if ((curInstruction == ADD_PAGE) ||
(curInstruction == ADD_PAGE_WITH_SCORE)) {
// An ADD_PAGE with an identical pair
// of pages replaces the existing one.
// We may need to note the fact for
// Garbage Collection.
//
// This happens in three stages.
// 1. We write necessary items to the future
// edits-list.
//
pagesByMD5Edits++;
// If this is a replacing add, we don't want
// to disturb the score from the old Page! This
// way, we can run some link analysis scoring
// while the new Pages are being fetched and
// not lose the info when a Page is replaced.
//
// If it is an ADD_PAGE_WITH_SCORE, then we
// go ahead and replace the old one.
//
// Either way, from now on we treat it
// as an ADD_PAGE
//
Page editItemPage = editItem.getPage();
if (curInstruction == ADD_PAGE) {
editItemPage.setScore(readerVal.getScore(), readerVal.getNextScore());
}
piwriter.appendInstructionInfo(futureEdits, editItemPage, ADD_PAGE, NullWritable.get());
//
// 2. We write the edit-page to *this* table.
//
newDb.append(editItemPage.getURL(), editItemPage);
//
// 3. We want the ADD in the next step (the
// MD5-driven table) to be a "replacing add".
// But that won't happen if the readerItem and
// the editItem Pages are not identical.
// (In this scenario, that means their URLs
// are the same, but their MD5s are different.)
// So, we need to explicitly handle that
// case by issuing a DELETE for the now-obsolete
// item.
if (editItemPage.compareTo(readerVal) != 0) {
pagesByMD5Edits++;
piwriter.appendInstructionInfo(futureEdits, readerVal, DEL_PAGE, NullWritable.get());
}
itemsWritten++;
// "Delete" the readerVal by skipping it.
hasEntries = db.next(readerKey, readerVal);
} else {
// ADD_PAGE_IFN_PRESENT. We only add if NOT
// present, and it was present! So we treat
// this case as a no-op and just move to the
// next edit.
}
// In either case, we process the edit.
hasEdits = edits.next(editItem);
} else if (comparison > 0) {
// We have inserted a Page that's before some
// entry in the existing database. So, we just
// need to write down the Page from the Edit file.
// It's like the above case, except we don't tell
// the future-edits to delete anything.
//
// 1. Write the item down for the future.
pagesByMD5Edits++;
//
// If this is an ADD_PAGE_IFN_PRESENT, then
// we may also have a Link we have to take care of!
//
if (curInstruction == ADD_PAGE_IFN_PRESENT) {
Link editLink = editItem.getLink();
if (editLink != null) {
addLink(editLink);
}
}
piwriter.appendInstructionInfo(futureEdits, editItem.getPage(), ADD_PAGE, NullWritable.get());
//
// 2. Write the edit-page to *this* table
newDb.append(editItem.getPage().getURL(), editItem.getPage());
itemsWritten++;
// Process the edit
hasEdits = edits.next(editItem);
}
} else if (curInstruction == DEL_PAGE) {
if (comparison < 0) {
// Write the readerKey, just passing it along.
// We don't process the edit yet.
newDb.append(readerKey, readerVal);
itemsWritten++;
hasEntries = db.next(readerKey, readerVal);
} else if (comparison == 0) {
// Delete it! We can only delete one item
// at a time, as all URLs are unique.
// 1. Tell the future-edits what page will need to
// be deleted.
pagesByMD5Edits++;
piwriter.appendInstructionInfo(futureEdits, readerVal, DEL_PAGE, NullWritable.get());
//
// 2. "Delete" the entry by skipping the Reader
// key.
hasEntries = db.next(readerKey, readerVal);
// Process the edit
hasEdits = edits.next(editItem);
} else if (comparison > 0) {
// Ignore it. We tried to delete an item that's
// not here.
hasEdits = edits.next(editItem);
}
}
}
// Now we have only edits. No more preexisting items!
while (! hasEntries && hasEdits) {
int curInstruction = editItem.getInstruction();
if (curInstruction == ADD_PAGE ||
curInstruction == ADD_PAGE_WITH_SCORE ||
curInstruction == ADD_PAGE_IFN_PRESENT) {
// No more reader entries, so ADD_PAGE_IFN_PRESENT
// is treated like a simple ADD_PAGE.
// 1. Tell the future edits-list about this new item
pagesByMD5Edits++;
piwriter.appendInstructionInfo(futureEdits, editItem.getPage(), ADD_PAGE, NullWritable.get());
// 2. Write the edit page to this table.
newDb.append(editItem.getPage().getURL(), editItem.getPage());
itemsWritten++;
} else if (curInstruction == DEL_PAGE) {
// Ignore it. We tried to delete an item
// that's not here.
}
// Either way, we always process the edit.
hasEdits = edits.next(editItem);
}
// Now we have only preexisting items. We just copy
// them to the new file, in order.
while (hasEntries && ! hasEdits) {
newDb.append(readerKey, readerVal);
itemsWritten++;
hasEntries = db.next(readerKey, readerVal);
}
}
}
/***
* The PagesByMD5Processor is used during close() time for
* the pagesByMD5 table. We instantiate one of these, and it
* takes care of the entire shutdown process.
*/
private class PagesByMD5Processor extends CloseProcessor {
/**
* Build the processor for the md5-driven page index.
*/
PagesByMD5Processor(MapFile.Reader db, EditSectionGroupWriter editWriter) {
super(PAGES_BY_MD5, db, editWriter, new SequenceFile.Sorter(new PageInstruction.PageComparator(), NullWritable.class), null, Page.class, NullWritable.class, "PagesByMD5Part");
}
/**
* Merge the edit stream into the md5-driven page index,
* garbage-collecting links whose last MD5 disappears.
*/
void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException {
// Create the keys and vals
Page readerItem = new Page();
PageInstruction editItem = new PageInstruction();
// For computing the GC list
Page lastItem = new Page();
boolean justDeletedItem = false;
boolean newReaderItem = false;
int itemRepeats = 0;
// Read the first items from both streams
boolean hasEntries = db.next(readerItem, NullWritable.get());
boolean hasEdits = sortedEdits.next(editItem, NullWritable.get());
if (hasEntries) {
// The first thing we read should become
// the "previous key". We need this for
// garbage collection.
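// (readerItem is copied into lastItem by round-tripping it
// through the shared serialization buffers.)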
outBuf.reset();
readerItem.write(outBuf);
inBuf.reset(outBuf.getData(), outBuf.getLength());
lastItem.readFields(inBuf);
itemRepeats = 0;
}
// As long we have both edits and entries, we need to
// interleave them.
while (hasEdits && hasEntries) {
int comparison = readerItem.compareTo(editItem.getPage());
int curInstruction = editItem.getInstruction();
//
// OK! Now perform operations
//
if (curInstruction == ADD_PAGE) {
if (comparison < 0) {
// Write readerItem, just passing it along.
// Don't process the edit yet.
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
newReaderItem = true;
} else if (comparison == 0) {
//
// This is a "replacing ADD", which is generated
// by the above-sequence. We should skip over the
// existing item, and add the new one instead.
//
// Note that by this point, the new version of the
// Page from the edit sequence is guaranteed to
// have the correct score. We make sure of it in
// the mergeEdits() for PagesByURLProcessor.
//
newDb.append(editItem.getPage(), NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
newReaderItem = true;
hasEdits = sortedEdits.next(editItem, NullWritable.get());
} else if (comparison > 0) {
// Write the edit item. We've inserted an item
// that comes before any others.
newDb.append(editItem.getPage(), NullWritable.get());
itemsWritten++;
hasEdits = sortedEdits.next(editItem, NullWritable.get());
}
} else if (curInstruction == ADD_PAGE_IFN_PRESENT) {
throw new IOException("Should never process ADD_PAGE_IFN_PRESENT for the index: " + editItem);
} else if (curInstruction == DEL_PAGE) {
if (comparison < 0) {
// Write the readerKey, just passing it along.
// Don't process the edit yet.
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
newReaderItem = true;
} else if (comparison == 0) {
// Delete it! Remember only one entry can
// be deleted at a time!
//
// "Delete" the entry by skipping over the reader
// item. We move onto the next item in the existing
// index, as well as the next edit instruction.
hasEntries = db.next(readerItem, NullWritable.get());
newReaderItem = true;
hasEdits = sortedEdits.next(editItem, NullWritable.get());
// We need to set this flag for GC'ing.
justDeletedItem = true;
} else if (comparison > 0) {
// This should never happen! We should only be
// deleting items that actually appear!
throw new IOException("An unapplicable DEL_PAGE should never appear during index-merge: " + editItem);
}
}
// GARBAGE COLLECTION
// We want to detect when we have deleted the
// last MD5 of a certain value. We can have
// multiple MD5s in the same index, as long as
// they have different URLs. When the last MD5
// is deleted, we want to know so we can modify
// the LinkDB.
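// Example (illustrative): with entries (A,url1) (A,url2) (B,url3)
// sorted by MD5, deleting (A,url2) alone does not GC md5 A,
// because (A,url1) makes it a repeat. Deleting (B,url3) does
// trigger deleteLink(B) once the reader moves past the B run.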
if (newReaderItem) {
// If we have a different readerItem which is just
// the same as our last one, then we know it's a
// repeat!
if (hasEntries && readerItem.getMD5().compareTo(lastItem.getMD5()) == 0) {
itemRepeats++;
} else {
// The current readerItem and the lastItem
// MD5s are not equal.
//
// If the last item was deleted, AND if the
// deleted item is not a repeat of the current item,
// then that MD5 should be garbage collected.
if (justDeletedItem && itemRepeats == 0) {
deleteLink(lastItem.getMD5());
}
// The current readerItem is the new "last key".
outBuf.reset();
readerItem.write(outBuf);
inBuf.reset(outBuf.getData(), outBuf.getLength());
lastItem.readFields(inBuf);
itemRepeats = 0;
}
// Clear "new-reader-item" bit
newReaderItem = false;
}
// Clear "last-deleted" bit
justDeletedItem = false;
}
// Now we have only edits. No more preexisting items!
while (! hasEntries && hasEdits) {
int curInstruction = editItem.getInstruction();
if (curInstruction == ADD_PAGE) {
// Just write down the new page!
newDb.append(editItem.getPage(), NullWritable.get());
itemsWritten++;
} else if (curInstruction == ADD_PAGE_IFN_PRESENT) {
throw new IOException("Should never process ADD_PAGE_IFN_PRESENT for the index: " + editItem);
} else if (curInstruction == DEL_PAGE) {
// This should never happen! We should only be
// deleting items that actually appear!
throw new IOException("An unapplicable DEL_PAGE should never appear during index-merge: " + editItem);
}
hasEdits = sortedEdits.next(editItem, NullWritable.get());
}
// Now we have only preexisting items. We just copy them
// to the new file, in order
while (hasEntries && ! hasEdits) {
// Simply copy through the remaining database items
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
newReaderItem = true;
}
}
}
/**
* The LinksByMD5Processor is used during close() for
* the linksByMD5 table. It processes all the edits to
* this table, and also generates edits for the linksByURL
* table.
*/
private class LinksByMD5Processor extends CloseProcessor {
EditSectionGroupWriter futureEdits;
/**
* We store "futureEdits" so we can emit edits for the
* next table's processing step (linksByURL).
*/
public LinksByMD5Processor(MapFile.Reader db, EditSectionGroupWriter editWriter, EditSectionGroupWriter futureEdits) {
super(LINKS_BY_MD5, db, editWriter, new SequenceFile.Sorter(new LinkInstruction.MD5Comparator(), NullWritable.class), new Link.MD5Comparator(), Link.class, NullWritable.class, "LinksByMD5Part");
this.futureEdits = futureEdits;
}
/**
* Merges edits into the md5-driven link table. Also generates
* edit sequence to apply to the URL-driven table.
*/
void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException {
WritableComparator comparator = new Link.MD5Comparator();
DeduplicatingLinkSequenceReader edits = new DeduplicatingLinkSequenceReader(sortedEdits);
// Create the keys and vals we'll use
LinkInstruction editItem = new LinkInstruction();
Link readerItem = new Link();
// Read the first items from both streams
boolean hasEntries = db.next(readerItem, NullWritable.get());
boolean hasEdits = edits.next(editItem);
// As long as we have both edits and entries to process,
// we need to interleave them
while (hasEntries && hasEdits) {
int curInstruction = editItem.getInstruction();
// Perform operations
if (curInstruction == ADD_LINK) {
// When we add a link, we may replace a previous
// link with identical URL and MD5 values. The
// Link.MD5Comparator will use both values.
//
int comparison = comparator.compare(readerItem, editItem.getLink());
if (comparison < 0) {
// Write the readerKey, just passing it along.
// Don't process the edit yet.
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
} else if (comparison == 0) {
// 1. Write down the item for table-edits
if (futureEdits != null) {
linksByURLEdits++;
liwriter.appendInstructionInfo(futureEdits, editItem.getLink(), ADD_LINK, NullWritable.get());
}
// 2. Write the new item, "replacing" the old one.
// We move to the next edit instruction and move
// past the replaced db entry.
newDb.append(editItem.getLink(), NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
hasEdits = edits.next(editItem);
} else if (comparison > 0) {
// 1. Write down the item for table-edits
if (futureEdits != null) {
linksByURLEdits++;
liwriter.appendInstructionInfo(futureEdits, editItem.getLink(), ADD_LINK, NullWritable.get());
}
// 2. Write the new item. We stay at the current
// db entry.
newDb.append(editItem.getLink(), NullWritable.get());
itemsWritten++;
hasEdits = edits.next(editItem);
}
} else if ((curInstruction == DEL_LINK) ||
(curInstruction == DEL_SINGLE_LINK)) {
// When we delete by DEL_LINK, we match on the source
// MD5 alone, so one instruction may delete many links
// at once: every entry with that MD5 goes, whatever
// its URL. A DEL_SINGLE_LINK matches on MD5 plus URL
// and removes exactly one link.
int comparison = 0;
if (curInstruction == DEL_LINK) {
comparison = readerItem.getFromID().compareTo(editItem.getLink().getFromID());
} else {
comparison = readerItem.md5Compare(editItem.getLink());
}
if (comparison < 0) {
// Write the readerKey, just passing it along.
// Don't process the edit yet.
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
} else if (comparison == 0) {
// Delete it (or them!)
// 1. Write the full instruction for the next
// delete-stage. That includes the read-in
// value
// 2. "Delete" the entry by skipping the
// readerKey. We DO NOT go to the next edit
// instruction! There might still be more
// entries in the database to which we should
// apply this delete-edit.
//
// Step 1. Write entry for future table-edits
if (futureEdits != null) {
linksByURLEdits++;
liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_LINK, NullWritable.get());
}
// Step 2.
// We might want to delete multiple MD5s with
// a single delete() operation, so keep this
// edit instruction around
hasEntries = db.next(readerItem, NullWritable.get());
if (curInstruction == DEL_SINGLE_LINK) {
hasEdits = edits.next(editItem);
}
} else if (comparison > 0) {
// Ignore, move on to next instruction
hasEdits = edits.next(editItem);
}
}
}
// Now we have only edits. No more preexisting items!
while (! hasEntries && hasEdits) {
int curInstruction = editItem.getInstruction();
if (curInstruction == ADD_LINK) {
// 1. Write down the item for future table-edits
if (futureEdits != null) {
linksByURLEdits++;
liwriter.appendInstructionInfo(futureEdits, editItem.getLink(), ADD_LINK, NullWritable.get());
}
// 2. Just add the item from the edit list
newDb.append(editItem.getLink(), NullWritable.get());
itemsWritten++;
} else if (curInstruction == DEL_LINK) {
// Ignore operation
}
// Move on to next edit
hasEdits = edits.next(editItem);
}
// Now we have only preexisting items. Just copy them
// to the new file, in order.
while (hasEntries && ! hasEdits) {
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
}
}
}
/**
* This class helps the LinksByURLProcessor test a list of
* Page objects, sorted by URL, for outlink-counts. We query
* this class with a series of questions, based on Links sorted
* by target URL.
*/
private class TargetTester {
MapFile.Reader pagedb;
boolean hasPage = false;
UTF8 pageURL = null;
Page page = null;
/**
* Wrap the URL-sorted page db and read ahead one entry.
*/
public TargetTester(MapFile.Reader pagedb) throws IOException {
this.pagedb = pagedb;
this.pageURL = new UTF8();
this.page = new Page();
this.hasPage = pagedb.next(pageURL, page);
}
/**
* Match the given URL against the sorted series of Page URLs.
*/
public int hasOutlinks(UTF8 curURL) throws IOException {
int returnCode = NO_OUTLINKS;
int comparison = pageURL.compareTo(curURL);
while (hasPage && comparison < 0) {
hasPage = pagedb.next(pageURL, page);
if (hasPage) {
comparison = pageURL.compareTo(curURL);
}
}
if (hasPage) {
if (comparison == 0) {
returnCode = (page.getNumOutlinks() > 0) ? HAS_OUTLINKS : NO_OUTLINKS;
} else if (comparison > 0) {
//
// This situation indicates that the Link's
// target page has been deleted, probably
// because we repeatedly failed to fetch the URL.
// So, we should delete the Link.
//
returnCode = LINK_INVALID;
}
}
return returnCode;
}
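// Example (illustrative): with pagedb URLs {a, c} and queries
// arriving in sorted order a, b, c:
//   hasOutlinks(a) -> inspects page a's outlink count
//   hasOutlinks(b) -> LINK_INVALID (b's target page is gone)
//   hasOutlinks(c) -> advances to c and inspects it
// Queries must arrive in ascending URL order; the reader only
// moves forward.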
/**
* Release the underlying page db reader.
*/
public void close() throws IOException {
pagedb.close();
}
}
/**
* Closes down and merges changes to the URL-driven link
* table. Along the way it emits corrective edits (outlink-flag
* updates, and deletions of links whose target page is gone)
* for a second pass over the linksByMD5 table.
*/
private class LinksByURLProcessor extends CloseProcessor {
MapFile.Reader pageDb;
EditSectionGroupWriter futureEdits;
/**
* We keep the (new) pagesByURL db so we can check link
* targets, and "futureEdits" for the second linksByMD5 pass.
*/
public LinksByURLProcessor(MapFile.Reader db, EditSectionGroupWriter editWriter, MapFile.Reader pageDb, EditSectionGroupWriter futureEdits) {
super(LINKS_BY_URL, db, editWriter, new SequenceFile.Sorter(new LinkInstruction.UrlComparator(), NullWritable.class), new Link.UrlComparator(), Link.class, NullWritable.class, "LinksByURLPart");
this.pageDb = pageDb;
this.futureEdits = futureEdits;
}
/**
* Close down as usual, then release the page db reader.
*/
public long closeDown(NutchFile workingDir, NutchFile outputDir) throws IOException {
long result = super.closeDown(workingDir, outputDir);
pageDb.close();
return result;
}
/**
* Merge the existing db with the edit-stream into a brand-new file.
*/
void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException {
WritableComparator comparator = new Link.UrlComparator();
// Create the keys and vals we'll use
LinkInstruction editItem = new LinkInstruction();
Link readerItem = new Link();
// Read the first items from both streams
boolean hasEntries = db.next(readerItem, NullWritable.get());
boolean hasEdits = sortedEdits.next(editItem, NullWritable.get());
TargetTester targetTester = new TargetTester(pageDb);
// As long as we have both edits and entries to process,
// we need to interleave them
while (hasEntries && hasEdits) {
int curInstruction = editItem.getInstruction();
if (curInstruction == ADD_LINK) {
// When we add a link, we may replace a previous
// link with identical URL and MD5 values. Our
// comparator will test both
//
int comparison = comparator.compare(readerItem, editItem.getLink());
if (comparison < 0) {
// Write the readerKey, just passing it along.
// Don't process the edit yet.
int linkTest = targetTester.hasOutlinks(readerItem.getURL());
if (linkTest == LINK_INVALID) {
liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_SINGLE_LINK, NullWritable.get());
targetOutlinkEdits++;
} else {
boolean oldOutlinkStatus = readerItem.targetHasOutlink();
boolean newOutlinkStatus = (linkTest == HAS_OUTLINKS);
// Do the conditional so we minimize unnecessary
// mod-writes.
if (oldOutlinkStatus != newOutlinkStatus) {
readerItem.setTargetHasOutlink(newOutlinkStatus);
liwriter.appendInstructionInfo(futureEdits, readerItem, ADD_LINK, NullWritable.get());
targetOutlinkEdits++;
}
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
}
hasEntries = db.next(readerItem, NullWritable.get());
} else if (comparison == 0) {
// Write the new item, "replacing" the old one.
// We move to the next edit instruction and move
// past the replaced db entry.
Link editLink = editItem.getLink();
int linkTest = targetTester.hasOutlinks(editLink.getURL());
// Delete the edit/readerItem from the other table if it's
// found to be invalid.
if (linkTest == LINK_INVALID) {
liwriter.appendInstructionInfo(futureEdits, editLink, DEL_SINGLE_LINK, NullWritable.get());
} else {
editLink.setTargetHasOutlink(linkTest == HAS_OUTLINKS);
liwriter.appendInstructionInfo(futureEdits, editLink, ADD_LINK, NullWritable.get());
newDb.append(editLink, NullWritable.get());
itemsWritten++;
}
targetOutlinkEdits++;
hasEntries = db.next(readerItem, NullWritable.get());
hasEdits = sortedEdits.next(editItem, NullWritable.get());
} else if (comparison > 0) {
// Write the new item. We stay at the current
// db entry.
Link editLink = editItem.getLink();
int linkTest = targetTester.hasOutlinks(editLink.getURL());
// Delete the edit from the other table if it's invalid
if (linkTest == LINK_INVALID) {
liwriter.appendInstructionInfo(futureEdits, editLink, DEL_SINGLE_LINK, NullWritable.get());
} else {
editLink.setTargetHasOutlink(linkTest == HAS_OUTLINKS);
liwriter.appendInstructionInfo(futureEdits, editLink, ADD_LINK, NullWritable.get());
newDb.append(editLink, NullWritable.get());
itemsWritten++;
}
targetOutlinkEdits++;
hasEdits = sortedEdits.next(editItem, NullWritable.get());
}
} else if (curInstruction == DEL_LINK) {
// When we delete a link, we do it by MD5 and apply
// it to the index first. A single delete instruction
// may remove many items in the db, during the earlier
// processing. However, unlike the index-processing stage,
// here we can expect a new DEL instruction for every
// item that we remove from the db.
//
int comparison = comparator.compare(readerItem, editItem.getLink());
if (comparison < 0) {
// Write readerKey, just passing it along. Don't
// process the edit yet.
int linkTest = targetTester.hasOutlinks(readerItem.getURL());
// Delete the reader item if it's found to be invalid
if (linkTest == LINK_INVALID) {
liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_SINGLE_LINK, NullWritable.get());
} else {
readerItem.setTargetHasOutlink(linkTest == HAS_OUTLINKS);
liwriter.appendInstructionInfo(futureEdits, readerItem, ADD_LINK, NullWritable.get());
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
}
targetOutlinkEdits++;
hasEntries = db.next(readerItem, NullWritable.get());
} else if (comparison == 0) {
// "Delete" the item by passing by the readerKey.
// We want a new entry, as well as the next instruction
// to process.
hasEntries = db.next(readerItem, NullWritable.get());
hasEdits = sortedEdits.next(editItem, NullWritable.get());
} else if (comparison > 0) {
// Ignore, move on to next instruction
hasEdits = sortedEdits.next(editItem, NullWritable.get());
}
}
}
// Now we have only edits. No more preexisting items!
while (! hasEntries && hasEdits) {
int curInstruction = editItem.getInstruction();
if (curInstruction == ADD_LINK) {
//
// Add the item from the edit list.
//
//
// Make sure the outlinks flag is set properly.
//
Link editLink = editItem.getLink();
int linkTest = targetTester.hasOutlinks(editLink.getURL());
if (linkTest == LINK_INVALID) {
liwriter.appendInstructionInfo(futureEdits, editLink, DEL_SINGLE_LINK, NullWritable.get());
} else {
editLink.setTargetHasOutlink(linkTest == HAS_OUTLINKS);
liwriter.appendInstructionInfo(futureEdits, editLink, ADD_LINK, NullWritable.get());
newDb.append(editLink, NullWritable.get());
itemsWritten++;
}
targetOutlinkEdits++;
} else if (curInstruction == DEL_LINK) {
// Ignore operation
}
// Move on to next edit
hasEdits = sortedEdits.next(editItem, NullWritable.get());
}
// Now we have only preexisting items. Just copy them
// to the new file, in order.
while (hasEntries && ! hasEdits) {
//
// Simply copy the remaining database items.
//
//
// First, make sure the 'outlinks' flag is set properly.
//
int linkTest = targetTester.hasOutlinks(readerItem.getURL());
if (linkTest == LINK_INVALID) {
liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_SINGLE_LINK, NullWritable.get());
targetOutlinkEdits++;
} else {
boolean oldOutlinkStatus = readerItem.targetHasOutlink();
boolean newOutlinkStatus = (linkTest == HAS_OUTLINKS);
if (oldOutlinkStatus != newOutlinkStatus) {
readerItem.setTargetHasOutlink(newOutlinkStatus);
liwriter.appendInstructionInfo(futureEdits, readerItem, ADD_LINK, NullWritable.get());
targetOutlinkEdits++;
}
// Now copy the object
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
}
// Move on to next
hasEntries = db.next(readerItem, NullWritable.get());
}
targetTester.close();
}
}
/**
* Create a brand-new distributed db. Called once, before any writer
* is opened; records how many machines (db sections) to expect and
* lays down an empty MapFile for each table in each section.
*/
public static void createDB(NutchFileSystem nutchfs, String dbName, int totalMachines) throws IOException {
//
// Check to see if the db already exists
//
NutchFile machineInfo = new NutchFile(nutchfs, dbName, "standard", new File("machineinfo"));
if (nutchfs.get(machineInfo, LONG_TIMEOUT) != null) {
throw new IOException("Cannot create WebDB at nutchfs " + nutchfs + " with name " + dbName + ", as it already exists.");
}
//
// Write down how many machines live in the distributed pool
//
File machineInfoFile = nutchfs.getWorkingFile();
DataOutputStream out = new DataOutputStream(new FileOutputStream(machineInfoFile));
try {
out.write(MACHINE_INFO_VERSION);
out.writeInt(totalMachines);
} finally {
out.close();
}
nutchfs.put(machineInfo, machineInfoFile, true);
//
// Create the lower directory structures for each machine in pool.
//
for (int i = 0; i < totalMachines; i++) {
NutchFile dbDir = new NutchFile(nutchfs, dbName, "standard", new File("webdb"));
NutchFile sectionDir = new NutchFile(dbDir, "dbsection." + i);
NutchFile pagesByURLNF = new NutchFile(sectionDir, PAGES_BY_URL);
NutchFile pagesByMD5NF = new NutchFile(sectionDir, PAGES_BY_MD5);
NutchFile linksByURLNF = new NutchFile(sectionDir, LINKS_BY_URL);
NutchFile linksByMD5NF = new NutchFile(sectionDir, LINKS_BY_MD5);
File pagesByURLFile = nutchfs.getWorkingFile();
File pagesByMD5File = nutchfs.getWorkingFile();
File linksByURLFile = nutchfs.getWorkingFile();
File linksByMD5File = nutchfs.getWorkingFile();
//
// If we're creating the db, we make a zero-length file for each
// db file
//
new MapFile.Writer(pagesByURLFile.getPath(), new UTF8.Comparator(), Page.class).close();
new MapFile.Writer(pagesByMD5File.getPath(), new Page.Comparator(), NullWritable.class).close();
new MapFile.Writer(linksByURLFile.getPath(), new Link.UrlComparator(), NullWritable.class).close();
new MapFile.Writer(linksByMD5File.getPath(), new Link.MD5Comparator(), NullWritable.class).close();
nutchfs.put(pagesByURLNF, pagesByURLFile, true);
nutchfs.put(pagesByMD5NF, pagesByMD5File, true);
nutchfs.put(linksByURLNF, linksByURLFile, true);
nutchfs.put(linksByMD5NF, linksByMD5File, true);
}
//
// Create the "ready-to-use" flag that tells all subsequent
// WebDBWriters it's OK to proceed.
//
File readyToUseFile = nutchfs.getWorkingFile();
NutchFile readyToUse = new NutchFile(nutchfs, dbName, "standard", new File("readyToUse"));
out = new DataOutputStream(new FileOutputStream(readyToUseFile));
try {
out.writeInt(READY_TO_USE); // Magic number
} finally {
out.close();
}
nutchfs.put(readyToUse, readyToUseFile, false);
}
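//
// Hedged usage sketch (names illustrative, not from any Nutch tool):
//
//   NutchFileSystem nfs = ...;   // obtained from the environment
//   DistributedWebDBWriter.createDB(nfs, "testdb", 4);
//   // then each machine i in [0, 4) opens its own writer:
//   IWebDBWriter writer = new DistributedWebDBWriter(nfs, "testdb", i);
//   writer.close();   // the last close() to finish publishes the db
//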
PageInstructionWriter piwriter = new PageInstructionWriter();
LinkInstructionWriter liwriter = new LinkInstructionWriter();
DataInputBuffer inBuf = new DataInputBuffer();
DataOutputBuffer outBuf = new DataOutputBuffer();
NutchFileSystem nutchfs;
String dbName;
NutchFile dbDir, oldDbDir, newDbDir, tmpDir;
NutchFile localWriteLock, globalWriteLock, closeCounter, openCounter;
EditSectionGroupWriter pagesByURLWriter, pagesByMD5Writer, linksByURLWriter, linksByMD5Writer;
MapFile.Reader pagesByURL, pagesByMD5, linksByURL, linksByMD5;
long pagesByURLEdits = 0, pagesByMD5Edits = 0, linksByURLEdits = 0, linksByMD5Edits = 0, targetOutlinkEdits = 0;
int machineNum, totalMachines;
/**
* Open this machine's section of the db: wait for the ready-to-use
* flag, seize the section lock, bump the open counter, and create
* readers for the four tables plus edit writers for new instructions.
*/
public DistributedWebDBWriter(NutchFileSystem nutchfs, String dbName, int machineNum) throws IOException {
//
// Store the nutchfs. Build dir set.
//
this.nutchfs = nutchfs;
this.dbName = dbName;
this.machineNum = machineNum;
this.dbDir = new NutchFile(nutchfs, dbName, "standard", new File("webdb"));
this.oldDbDir = new NutchFile(nutchfs, dbName, "standard", new File("webdb.old"));
this.newDbDir = new NutchFile(nutchfs, dbName, "standard", new File("webdb.new"));
this.tmpDir = new NutchFile(newDbDir, "tmp");
//
// Wait indefinitely for "ready-to-use-flag".
//
NutchFile readyToUse = new NutchFile(nutchfs, dbName, "standard", new File("readyToUse"));
nutchfs.get(readyToUse);
//////////////////////////////////////////////////////////
// Locks
//////////////////////////////////////////////////////////
// 1. Each dbsection has a lock so only one writer ever accesses
// it at once. Lock the local one immediately.
this.localWriteLock = new NutchFile(nutchfs, dbName, "standard", new File("sectionLock." + machineNum));
nutchfs.lock(localWriteLock, true);
// 2. A global writeLock protects writers that need to make
// changes that affect many processors (such as moving dbDir or
// deleting tmp).
//
// Readers will obtain this lock non-exclusively. When it comes
// time for global changes to the db, writers will obtain it
// exclusively. Readers need to leave before these changes can
// be made.
this.globalWriteLock = new NutchFile(nutchfs, dbName, "standard", new File("globalWriteLock"));
// 3. Not quite locks, but related: the openCounter and
// closeCounter, which track how many processors have opened
// the db and how many have made it through the close
// sequence. Both are protected by globalWriteLock.
this.openCounter = new NutchFile(newDbDir, "openCounter");
this.closeCounter = new NutchFile(newDbDir, "closeCounter");
//////////////////////////////////////////////////////////
// Setup and Initialization
//////////////////////////////////////////////////////////
// Load # of machines
NutchFile machineInfo = new NutchFile(nutchfs, dbName, "standard", new File("machineinfo"));
File machineInfoFile = nutchfs.get(machineInfo);
DataInputStream in = new DataInputStream(new FileInputStream(machineInfoFile));
try {
in.read(); // version
this.totalMachines = in.readInt();
} finally {
in.close();
}
//
// Seize global lock
//
nutchfs.lock(globalWriteLock, true);
// Now we use these locks to resolve any partially-completed
// state directories from a previous run.
// REMIND - mjc - Fixing/defining the db/newdb and tmp-delete
// sequence is the most important next step!
/***
File oldDbDirFile = nutchfs.get(oldDbDir, SHORT_TIMEOUT);
if (oldDbDirFile != null) {
File dbDirFile = nutchfs.get(dbDir, SHORT_TIMEOUT);
if (dbDirFile != null) {
throw new IOException("Impossible condition: directories " + oldDbDir + " and " + dbDir + " cannot exist simultaneously");
}
File newDbDirFile = nutchfs.get(newDbDir, SHORT_TIMEOUT);
if (newDbDirFile != null) {
nutchfs.renameTo(newDbDir, dbDir);
}
nutchfs.delete(oldDbDir);
} else {
File newDbDirFile = nutchfs.get(newDbDir, SHORT_TIMEOUT);
if (newDbDirFile != null) {
nutchfs.delete(newDbDir);
}
}
// Delete any partial edits from last time.
if (nutchfs.get(tmpDir, LONG_TIMEOUT) != null) {
nutchfs.delete(tmpDir);
}
****/
// Load how many machines have started so far. If we're the
// first one, then we have to create the EditSectionGroupWriter
// structures.
int numOpens = 0;
File openCounterFile = nutchfs.get(openCounter, LONG_TIMEOUT);
if (openCounterFile != null) {
in = new DataInputStream(new FileInputStream(openCounterFile));
try {
in.read(); // version
numOpens = in.readInt();
} finally {
in.close();
}
} else {
openCounterFile = nutchfs.getWorkingFile();
}
// Bump number by 1.
DataOutputStream out = new DataOutputStream(new FileOutputStream(openCounterFile));
try {
out.write(OPEN_COUNTER_VERSION);
out.writeInt(numOpens + 1);
} finally {
out.close();
}
nutchfs.put(openCounter, openCounterFile, true);
// Check if we're the first ones to open.
if (numOpens == 0) {
// Build an edit-section for each of the 4 edit types
EditSectionGroupWriter.createEditGroup(nutchfs, dbName, PAGES_BY_URL, totalMachines, EditSectionGroupWriter.URL_KEYSPACE);
EditSectionGroupWriter.createEditGroup(nutchfs, dbName, PAGES_BY_MD5, totalMachines, EditSectionGroupWriter.MD5_KEYSPACE);
EditSectionGroupWriter.createEditGroup(nutchfs, dbName, LINKS_BY_URL, totalMachines, EditSectionGroupWriter.URL_KEYSPACE);
EditSectionGroupWriter.createEditGroup(nutchfs, dbName, LINKS_BY_MD5, totalMachines, EditSectionGroupWriter.MD5_KEYSPACE);
// Remove the flag that tells readers it's OK to proceed
NutchFile dirIsComplete = new NutchFile(dbDir, "dbIsComplete");
nutchfs.delete(dirIsComplete);
}
// These are the NutchFiles for this section of the read-only
// db.
NutchFile sectionDir = new NutchFile(dbDir, "dbsection." + machineNum);
NutchFile pagesByURLNF = new NutchFile(sectionDir, PAGES_BY_URL);
NutchFile pagesByMD5NF = new NutchFile(sectionDir, PAGES_BY_MD5);
NutchFile linksByURLNF = new NutchFile(sectionDir, LINKS_BY_URL);
NutchFile linksByMD5NF = new NutchFile(sectionDir, LINKS_BY_MD5);
//
// Release the global lock
//
nutchfs.release(globalWriteLock);
// Create Readers for the above NutchFiles
this.pagesByURL = new MapFile.Reader(nutchfs.get(pagesByURLNF).getPath(), new UTF8.Comparator());
this.pagesByMD5 = new MapFile.Reader(nutchfs.get(pagesByMD5NF).getPath(), new Page.Comparator());
this.linksByURL = new MapFile.Reader(nutchfs.get(linksByURLNF).getPath(), new Link.UrlComparator());
this.linksByMD5 = new MapFile.Reader(nutchfs.get(linksByMD5NF).getPath(), new Link.MD5Comparator());
// Create writers for new edit-files. We write changes
// into these files, then apply them to the db upon close().
this.pagesByURLWriter = new EditSectionGroupWriter(nutchfs, dbName, machineNum, totalMachines, PAGES_BY_URL, PageInstruction.class, NullWritable.class, new EditSectionGroupWriter.PageURLExtractor());
this.pagesByMD5Writer = new EditSectionGroupWriter(nutchfs, dbName, machineNum, totalMachines, PAGES_BY_MD5, PageInstruction.class, NullWritable.class, new EditSectionGroupWriter.PageMD5Extractor());
this.linksByURLWriter = new EditSectionGroupWriter(nutchfs, dbName, machineNum, totalMachines, LINKS_BY_URL, LinkInstruction.class, NullWritable.class, new EditSectionGroupWriter.LinkURLExtractor());
this.linksByMD5Writer = new EditSectionGroupWriter(nutchfs, dbName, machineNum, totalMachines, LINKS_BY_MD5, LinkInstruction.class, NullWritable.class, new EditSectionGroupWriter.LinkMD5Extractor());
}
/**
* Shutdown. Applies all buffered edits to the four tables, then
* runs the distributed-close protocol at the end of this method.
*/
public synchronized void close() throws IOException {
// Process the 4 tables:
// 1. pagesByURL
// 2. pagesByMD5
// 3. linksByMD5
// 4. linksByURL
// 1. Process pagesByURL. Processing this stream will
// generate a number of edits for the pagesByMD5 step.
//
CloseProcessor pagesByURLProcessor = new PagesByURLProcessor(pagesByURL, pagesByURLWriter, pagesByMD5Writer);
long numPBUItems = pagesByURLProcessor.closeDown(tmpDir, newDbDir);
//
// 2. Process the pagesByMD5 edit stream. This will
// make calls to deleteLink(), which are processed later.
//
CloseProcessor pagesByMD5Processor = new PagesByMD5Processor(pagesByMD5, pagesByMD5Writer);
long numPBMItems = pagesByMD5Processor.closeDown(tmpDir, newDbDir);
//
// 3. Process the linksByMD5 edit stream first. This
// will generate a number of edits for the linksByURL
// stream. This also processes the calls to deleteLink()
// that may have been invoked as part of the above call
// to process pagesByMD5.
CloseProcessor linksByMD5Processor = new LinksByMD5Processor(linksByMD5, linksByMD5Writer, linksByURLWriter);
long numLBMItems = linksByMD5Processor.closeDown(tmpDir, newDbDir);
//
// 4. Process the linksByURL edit stream. This will also
// read through the sorted PagesByURL file, and modify
// the Links so that they indicate whether the target
// Page has any outlinks or not.
//
// Duplicate the LINKS_BY_MD5 editsWriter, because the 1st one has
// already been closed.
EditSectionGroupWriter targetOutlinkEditsWriter = new EditSectionGroupWriter(nutchfs, dbName, machineNum, totalMachines, LINKS_BY_MD5, LinkInstruction.class, NullWritable.class, new EditSectionGroupWriter.LinkMD5Extractor());
// Find the just-written dbsection output for PAGES_BY_URL
NutchFile newSectionDir = new NutchFile(newDbDir, "dbsection." + machineNum);
NutchFile newPagesByURLNF = new NutchFile(newSectionDir, PAGES_BY_URL);
CloseProcessor linksByURLProcessor = new LinksByURLProcessor(linksByURL, linksByURLWriter, new MapFile.Reader(nutchfs.get(newPagesByURLNF).getPath(), new UTF8.Comparator()), targetOutlinkEditsWriter);
long numLBUItems = linksByURLProcessor.closeDown(tmpDir, newDbDir);
//
// If the number of linksByURL items processed is zero, then
// there's no reason to do all of the following with
// a 2nd pass through linksByMD5.
//
if (numLBUItems != 0) {
//
// 5. Step 4 did several things to the LinksByURL db.
// First, it implemented all the changes generated
// by instructions from LinksByMD5Processor. Second,
// it made lots of calls to setTargetHasOutlink. This
// changes the content of the Link objects.
//
// So now we need to reconstruct the LinksByMD5
// list, using the Links we created in step #4.
//
NutchFile newLinksByMD5NF = new NutchFile(newSectionDir, LINKS_BY_MD5);
MapFile.Reader linksByMD5ForStageTwo = new MapFile.Reader(nutchfs.get(newLinksByMD5NF).getPath(), new Link.MD5Comparator());
NutchFile stageTwoDbDir = new NutchFile(newDbDir, "stage2.subdir");
CloseProcessor linksByMD5StageTwoProcessor = new LinksByMD5Processor(linksByMD5ForStageTwo, targetOutlinkEditsWriter, null);
numLBMItems = linksByMD5StageTwoProcessor.closeDown(tmpDir, stageTwoDbDir);
//
// 6. Now move the Stage2 LinksByMD5 file up to replace
// the one at the primary level
//
linksByMD5ForStageTwo.close();
NutchFile stageOneLinksByMD5 = new NutchFile(newDbDir, LINKS_BY_MD5);
NutchFile stageTwoLinksByMD5 = new NutchFile(stageTwoDbDir, LINKS_BY_MD5);
nutchfs.delete(stageOneLinksByMD5);
nutchfs.renameTo(stageTwoLinksByMD5, stageOneLinksByMD5);
}
//
// 7. Finally, write out the total number of pages and links.
//
NutchFile sectionStats = new NutchFile(newSectionDir, STATS_FILE);
File sectionStatsFile = nutchfs.getWorkingFile();
DataOutputStream out = new DataOutputStream(new FileOutputStream(sectionStatsFile));
try {
//
// These counts are guaranteed to be correct; they're
// based on the counts made during processing of primary-key
// edits. Pages are always counted by URL first, and only
// subsequently by MD5 if there are any edits to make. Links
// are always counted by MD5 first, and only by URL subsequently
// and conditionally.
//
// If there are a bunch of edits that result in no modifications
// to the db, the two sets of counts (one for URL, one for
// MD5) could become out of sync. So we use the ones that
// are sure to be accurate.
//
out.write(CUR_VERSION);
out.writeLong(numPBUItems);
out.writeLong(numLBMItems);
} finally {
out.close();
nutchfs.put(sectionStats, sectionStatsFile, true);
}
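//
// For reference, a consumer of this stats file would reverse the
// layout just written (one version byte, then two longs). A minimal
// sketch, assuming the consumer fetches the file through the same
// NutchFileSystem API used below for the close-counter:
//
// File statsFile = nutchfs.get(sectionStats, LONG_TIMEOUT);
// DataInputStream statsIn = new DataInputStream(new FileInputStream(statsFile));
// try {
// statsIn.read(); // version (CUR_VERSION)
// long totalPages = statsIn.readLong(); // numPBUItems
// long totalLinks = statsIn.readLong(); // numLBMItems
// } finally {
// statsIn.close();
// }
//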
// Close down the db-readers
pagesByURL.close();
pagesByMD5.close();
linksByMD5.close();
linksByURL.close();
//////////////////////////////////////////////////////////////
// Now we need to do a distributed-close. It works by
// the "last person out turns off the lights" protocol.
// All the processors but one will exit without doing anything.
// The last one to exit does all the directory moves.
//////////////////////////////////////////////////////////////
//
// First step is to obtain the global writeLock exclusively.
// DBReaders will try to obtain this lock non-exclusively. That
// way, there can be many readers at once, but all of them must
// exit before a single process can blow away the directories.
//
nutchfs.lock(globalWriteLock, true);
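//
// (For contrast, a DBReader would acquire the same lock in shared
// mode. A minimal sketch, assuming the boolean flag selects
// exclusive vs. non-exclusive acquisition, as described above:
//
// nutchfs.lock(globalWriteLock, false); // shared: many readers at once
// try {
// // ... read from the db directories ...
// } finally {
// nutchfs.release(globalWriteLock);
// }
// )
//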
//
// Read in how many processes have closed already
//
int numCloses = 0;
File closeCounterFile = nutchfs.get(closeCounter, LONG_TIMEOUT);
if (closeCounterFile != null) {
DataInputStream in = new DataInputStream(new FileInputStream(closeCounterFile));
try {
in.read(); // version
numCloses = in.readInt();
} finally {
in.close();
}
} else {
closeCounterFile = nutchfs.getWorkingFile();
}
if (numCloses == totalMachines) {
throw new IOException("All the processors have already shut down. Impossible condition!");
}
// Bump that number by 1.
out = new DataOutputStream(new FileOutputStream(closeCounterFile));
try {
out.write(CLOSE_COUNTER_VERSION);
out.writeInt(numCloses + 1);
} finally {
out.close();
}
nutchfs.put(closeCounter, closeCounterFile, true);
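//
// Worked example: with totalMachines == 3, the first process to
// close finds no counter file (numCloses == 0) and writes 1; the
// second reads 1 and writes 2; the third reads 2, writes 3, and
// because numCloses == totalMachines - 1 it performs the cleanup
// and directory moves below.
//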
// Check if this processor is the last one to close.
if (numCloses == totalMachines - 1) {
// Delete edits that might still be lingering around...
for (int i = 0; i < totalMachines; i++) {
new EditSectionGroupReader(nutchfs, dbName, PAGES_BY_URL, i, totalMachines).delete();
new EditSectionGroupReader(nutchfs, dbName, PAGES_BY_MD5, i, totalMachines).delete();
new EditSectionGroupReader(nutchfs, dbName, LINKS_BY_URL, i, totalMachines).delete();
new EditSectionGroupReader(nutchfs, dbName, LINKS_BY_MD5, i, totalMachines).delete();
}
// Complete directories and move them into place
nutchfs.completeDir(tmpDir);
nutchfs.completeDir(dbDir);
nutchfs.completeDir(newDbDir);
//
// Write out the "complete" flag, which tells
// readers it's OK to proceed
//
File dirIsCompleteFile = nutchfs.getWorkingFile();
NutchFile dirIsComplete = new NutchFile(newDbDir, "dbIsComplete");
out = new DataOutputStream(new FileOutputStream(dirIsCompleteFile));
try {
out.writeInt(IS_COMPLETE); // Magic number
} finally {
out.close();
}
nutchfs.put(dirIsComplete, dirIsCompleteFile, true);
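//
// A reader can test this flag before trusting the new directory.
// A minimal sketch, reusing the timeout-based get() pattern from
// the close-counter above:
//
// NutchFile completeFlag = new NutchFile(newDbDir, "dbIsComplete");
// File flagFile = nutchfs.get(completeFlag, LONG_TIMEOUT);
// DataInputStream flagIn = new DataInputStream(new FileInputStream(flagFile));
// try {
// boolean ready = (flagIn.readInt() == IS_COMPLETE);
// } finally {
// flagIn.close();
// }
//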
// Here we need to 'finish' the db operation.
// That involves: 1. Removing the tmpdir.
// 2. Moving the dbDir to oldDbDir
// 3. Renaming the newDbDir to dbDir
// 4. Removing the oldDbDir
//
// 1.
nutchfs.delete(tmpDir);
// 2.
nutchfs.renameTo(dbDir, oldDbDir);
// 3.
nutchfs.renameTo(newDbDir, dbDir);
// 4.
nutchfs.delete(oldDbDir);
}
// Done.
nutchfs.release(globalWriteLock);
nutchfs.release(localWriteLock);
}
/////////////////////
// Methods for adding and managing db operations
////////////////////
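//
// Typical client usage (a minimal sketch; the three-argument
// constructor and the Page/Link constructors are the ones
// exercised by main() below, while url, md5, fromID, domID, and
// anchorText stand in for caller-supplied values):
//
// DistributedWebDBWriter writer = new DistributedWebDBWriter(nutchfs, "db", machineNum);
// try {
// writer.addPage(new Page(url, md5)); // queue a page edit
// writer.addLink(new Link(fromID, domID, url, anchorText)); // queue a link edit
// } finally {
// writer.close(); // applies all queued edits, then runs the distributed close
// }
//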
/**
* Add a page to the page database
*/
public synchronized void addPage(Page page) throws IOException {
// The 2nd (byMD5) part is handled during processing of the 1st.
pagesByURLEdits++;
piwriter.appendInstructionInfo(pagesByURLWriter, page, ADD_PAGE, NullWritable.get());
}
/**
* Add a page to the page database, with a brand-new score
*/
public synchronized void addPageWithScore(Page page) throws IOException {
// The 2nd (byMD5) part is handled during processing of the 1st.
pagesByURLEdits++;
piwriter.appendInstructionInfo(pagesByURLWriter, page, ADD_PAGE_WITH_SCORE, NullWritable.get());
}
/**
* Add a page to the page database, but don't replace
* the one already in the database, if there is one.
*/
public synchronized void addPageIfNotPresent(Page page) throws IOException {
// The 2nd (index) part is handled during processing of the 1st.
pagesByURLEdits++;
piwriter.appendInstructionInfo(pagesByURLWriter, page, ADD_PAGE_IFN_PRESENT, NullWritable.get());
}
/**
* Add a page to the page database, but don't replace
* the one already in the database, if there is one.
*
* If we do insert the new Page, we also insert the
* given Link object.
*/
public synchronized void addPageIfNotPresent(Page page, Link link) throws IOException {
// The 2nd (index) part is handled during processing of the 1st.
pagesByURLEdits++;
piwriter.appendInstructionInfo(pagesByURLWriter, page, link, ADD_PAGE_IFN_PRESENT, NullWritable.get());
}
/**
* Remove a page from the page database.
*/
public synchronized void deletePage(String url) throws IOException {
// The 2nd (index) part is handled during processing of the 1st.
Page p = new Page();
p.setURL(url);
pagesByURLEdits++;
piwriter.appendInstructionInfo(pagesByURLWriter, p, DEL_PAGE, NullWritable.get());
}
/**
* Add a link to the link database
*/
public synchronized void addLink(Link lr) throws IOException {
linksByMD5Edits++;
liwriter.appendInstructionInfo(linksByMD5Writer, lr, ADD_LINK, NullWritable.get());
}
/**
* Remove all links with the given source MD5 from the db.
*/
private synchronized void deleteLink(MD5Hash md5) throws IOException {
linksByMD5Edits++;
liwriter.appendInstructionInfo(linksByMD5Writer, new Link(md5, 0, "", ""), DEL_LINK, NullWritable.get());
}
/**
* DistributedWebDBWriter.main() provides some handy commands
* for testing the WebDB.
*/
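//
// Example invocations (illustrative only; the db root path, the
// MD5 ids, and the URL are hypothetical):
//
// java net.nutch.db.DistributedWebDBWriter /tmp/dbroot -create 4
// java net.nutch.db.DistributedWebDBWriter /tmp/dbroot 0 -addpage <id> http://example.com/
// java net.nutch.db.DistributedWebDBWriter /tmp/dbroot 0 -deletelink <fromID>
//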
public static void main(String argv[]) throws FileNotFoundException, IOException {
if (argv.length < 3) {
System.out.println("Usage: java net.nutch.db.DistributedWebDBWriter <db> [-create <numProcessors>] | <machineInt> ([-addpage id url] | [-addpageifnp id url] | [-deletepage url] | [-addlink fromID url] | [-deletelink fromID])");
return;
}
NutchFileSystem nutchfs = new NutchNFSFileSystem(new File(argv[0]), true);
if ("-create".equals(argv[1])) {
DistributedWebDBWriter.createDB(nutchfs, "db", Integer.parseInt(argv[2]));
System.out.println("Created webdb at " + argv[0]);
} else {
int machineNum = Integer.parseInt(argv[1]);
String cmd = argv[2];
if ("-addpage".equals(cmd)) {
MD5Hash md5 = new MD5Hash(argv[3]);
String url = argv[4];
DistributedWebDBWriter writer = new DistributedWebDBWriter(nutchfs, "db", machineNum);
Page page = new Page(url, md5);
writer.addPageWithScore(page);
System.out.println("Added page (with score): " + page);
writer.close();
} else if ("-addpageifnp".equals(cmd)) {
MD5Hash md5 = new MD5Hash(argv[3]);
String url = argv[4];
DistributedWebDBWriter writer = new DistributedWebDBWriter(nutchfs, "db", machineNum);
try {
Page page = new Page(url, md5);
writer.addPageIfNotPresent(page);
System.out.println("Added page: " + page);
} finally {
writer.close();
}
} else if ("-deletepage".equals(cmd)) {
String url = argv[3];
DistributedWebDBWriter writer = new DistributedWebDBWriter(nutchfs, "db", machineNum);
try {
writer.deletePage(url.trim());
System.out.println("Deleted item(s)");
} finally {
writer.close();
}
} else if ("-addlink".equals(cmd)) {
MD5Hash fromID = new MD5Hash(argv[3]);
String url = argv[4];
DistributedWebDBWriter writer = new DistributedWebDBWriter(nutchfs, "db", machineNum);
try {
Link link = new Link(fromID, MD5Hash.digest("randomstring.com").halfDigest(), url, "SomeRandomAnchorText_" + System.currentTimeMillis());
writer.addLink(link);
System.out.println("Added link: " + link);
} finally {
writer.close();
}
} else if ("-deletelink".equals(cmd)) {
MD5Hash fromID = new MD5Hash(argv[3]);
DistributedWebDBWriter writer = new DistributedWebDBWriter(nutchfs, "db", machineNum);
try {
writer.deleteLink(fromID);
System.out.println("Deleted item(s)");
} finally {
writer.close();
}
} else {
System.out.println("Sorry, no command with name " + argv[1]);
}
}
}
}